#!/usr/bin/perl

use POSIX;
use strict;
use Text::CSV;
use List::Util qw(min max sum);
use List::MoreUtils qw(uniq firstidx);
use Bio::AlignIO;
use Bio::SeqIO;
use Cwd;
use Cwd 'abs_path';
use Getopt::Long;
use experimental 'smartmatch';
use Bio::Tools::Run::Alignment::Muscle;
use Bio::Align::Utilities;


################################################################################
######### Setup
$|=1;

my @fourLetterCodons = ("CT","GT","GC","AC","TC","CC","CG","GG");
my $MAX_GAPS=1; ## Number of species with gaps allowed in the alignment

my $conservatoryDir=abs_path(".");
my $referenceGenomeList;

my $referenceGenomeFamily;
my %referenceGenomeSpecies;
my $treeFileName;
my $minInformativeBases = 20000;
my $numOfGenesUsed=0;
my $mode="CODON";  ## neurtal model sequence source. Either CODON or INTRON. Currently only CODON is supported
my $model="REV";
my $help=0;

my $keepTmp=0;

GetOptions ("conservatory-directory=s" => \$conservatoryDir,
			"reference=s" => \$referenceGenomeList,
			"tree=s" => \$treeFileName,
			"mode=s" => \$mode,
			"model=s" => \$model,
			"keep-tmp" => \$keepTmp,
			"help" => \$help) or die ("Error in command line arguments\n");

if($help || $referenceGenomeList eq "" || $treeFileName eq "") {
   print "\n\tbuildModel --reference <ReferenceGenome>,<referenceGenome> \n\t\tbuildModel accepts a list of comma-separated reference genomes from the same family (for which alignments have been done) and a tree file for the family\n\t\tand produces a model file for phyloP based on non-coding sequence data.\n\n";
   print "\t--conservatory-directory\tName of the conservatory directory. Default is current directory.\n";
   print "\t--reference\tComma delimited list of reference genomes (At least one is REQUIRED).\n";
   print "\t--tree\t\tFamily tree file for phyloFit (REQUIRED).\n";
   print "\t--model\t\tSubstitution model for phyloFit (Default: REV).\n";
   print "\t--mode\t\tSource for neutral model estimation. Either CODON or INTRON (used 4-variant codon position, or alignments of first intron). Only CODON is supported.\n";
   print "\t--keep-tmp\t\tDo not delete the temporary fasta file of the 3rd condon positions.\n";
   print "\t--help\t\tPrints this message.\n\n";
   
   exit();
}

my $genomedbFileName = $conservatoryDir . "/genome_database.csv";
my $defaultModelFile = "$conservatoryDir/scripts/defaultModel.mod";
die "ERROR: Cannot find file genome database file ($genomedbFileName)\n" unless -e $genomedbFileName;
die "ERROR: Cannot find file tree filename ($treeFileName)\n" unless -e $treeFileName;
die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt\n" unless (-e "$conservatoryDir/scripts");
die "ERROR: Cannot find default mode file $defaultModelFile\n" unless (-e $defaultModelFile);

########## read genome database to get species list
my @referenceGenomes = split ',', $referenceGenomeList;

my %genomesInFamily;
my %speciesInFamily;
my %genomeToSpecies;
my %proteinNameTranslator;
my %geneNameTranslator;
my %FourVarPosSequence;
my %alignmentDirs;


open (my $genomeDatabase, "<", $genomedbFileName);
while((my $curgenomeline = <$genomeDatabase>)) {
	if(substr($curgenomeline,0,1) ne "#") { 
		my ($curgenomeName, $curgenomeSpecies,  $curgenomeFamily, $curgenomeReference, $upstreamLength, $downstreamLength, $geneNameField, $geneProcessingRegEx, $gene2SpeciesIdentifier, $proteinProcessingRegEx, $classification) = split /,/, $curgenomeline;
		$genomesInFamily{$curgenomeFamily}{$curgenomeName}=1;
		$speciesInFamily{$curgenomeFamily}{$curgenomeSpecies}=1;
	
		if(defined $genomeToSpecies{$curgenomeName}) { die "ERROR: Genome name must be unique ($curgenomeName).\n"; }
	
		$genomeToSpecies{$curgenomeName}= $curgenomeSpecies;
		$proteinNameTranslator{$curgenomeName}= $proteinProcessingRegEx;
		$geneNameTranslator{$curgenomeName}= $geneProcessingRegEx;

		if($curgenomeName ~~ @referenceGenomes) {
			if($referenceGenomeFamily eq "") { 
				$referenceGenomeFamily = $curgenomeFamily;
			}
			if($referenceGenomeFamily ne $curgenomeFamily) {
				die "ERROR: Reference genomes $referenceGenomeList are from different families ($referenceGenomeFamily,$curgenomeFamily). They must be from the same family.\n";
			}
			$referenceGenomeSpecies{$curgenomeName} = $curgenomeSpecies;
			$alignmentDirs{$curgenomeName} = "$conservatoryDir/alignments/$curgenomeName";
			die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt. Cannot find alignment directory " . $alignmentDirs{$curgenomeName} . "\n" unless (-e $alignmentDirs{$curgenomeName});
		}
	}
}
close($genomeDatabase);

my $tmpFastaFileName = "$conservatoryDir/genomes/$referenceGenomeFamily.formodel.fasta";

die "ERROR: Cannot find family for the reference genomes ($referenceGenomeList) in the genome database file.\n" unless ($referenceGenomeFamily ne "");

if(scalar @referenceGenomes != scalar keys %referenceGenomeSpecies) {
	die "ERROR: Found only " . join(",", keys %referenceGenomeSpecies) . " reference genomes out of the list $referenceGenomeList.\n";
}
print "PROGRESS: START automatic phylogenetic model computation for family: $referenceGenomeFamily using reference genomes:" . join(",", keys %referenceGenomeSpecies) . "\n";

#### Now read the CDS and proteins files

my @speciesInReferenceFamily = keys %{ $speciesInFamily{$referenceGenomeFamily} };
my $numOfSpeciesInFamily = (scalar @speciesInReferenceFamily);
my @genomesInReferenceFamily = keys %{ $genomesInFamily{$referenceGenomeFamily} };

my $genomesDir = "$conservatoryDir/genomes/$referenceGenomeFamily";

print "PROGRESS: Reading proteins and CDS sequences for $referenceGenomeFamily.\n";
print "PROGRESS: Family contains " . scalar @speciesInReferenceFamily . " species:" . join(",", sort @speciesInReferenceFamily) . "\n";
my %proteinSeqDB;
my %cdsSeqDB;

foreach my $curGenome (@genomesInReferenceFamily) {
	print "PROGRESS: Reading $curGenome...";
	my $proteinTranslatorForGenome = $proteinNameTranslator{$curGenome};
	my $geneNameTranslatorForGenome = $geneNameTranslator{$curGenome};
	
	my $proteinSeqReader = Bio::SeqIO->new( -format => "fasta",
										   -file => "$genomesDir/$curGenome.proteins.fasta");
	my $totalProteins=0;
	my $totalCDS=0;
	while ( my $seq = $proteinSeqReader->next_seq() ) {
		### Convert seq name from protein to fullgenename
		my $locus = $seq->id();
		if($proteinTranslatorForGenome ne "") { eval '$locus =~ s/$proteinTranslatorForGenome//g;'; }
		$locus =~ s/ .*$//;
		my $fullGeneName = $genomeToSpecies{$curGenome} . "-$curGenome-$locus";
		### remove protein illegal embelishment
		my $proteinSeq = uc($seq->seq());
		$proteinSeq =~ s/\.$//;
		$proteinSeq =~ s/\*$//;

		if(length($proteinSeq) > length($proteinSeqDB{$fullGeneName}) ) {  ##If more than one sequence (spliceform?) take longest one
			$proteinSeqDB{$fullGeneName} = $proteinSeq;
		}
		$totalProteins++;
	}
	print "$totalProteins protein sequences found...";

	my $cdsSeqReader = Bio::SeqIO->new( -format => "fasta",
						-file => "$genomesDir/$curGenome.cds.fasta");
	while ( my $seq = $cdsSeqReader->next_seq() ) {
		my $locus = $seq->id();
		if($geneNameTranslatorForGenome ne "") { eval '$locus =~ s/$geneNameTranslatorForGenome//g;'; }
		$locus =~ s/ .*$//;
		my $fullGeneName = $genomeToSpecies{$curGenome} . "-$curGenome-$locus";
		## remove the stop from the CDS, if found
		## Trim the CDS to triplets
		my $cdsSeq = uc($seq->seq());
		$cdsSeq = substr($cdsSeq,0, length($cdsSeq) - (length($cdsSeq) % 3));
		$cdsSeq =~ s/(TGA|TAA|TAG)$//;

		if(length($cdsSeq) == length($proteinSeqDB{$fullGeneName})*3) {
				$cdsSeqDB{$fullGeneName} = $cdsSeq;
				$totalCDS++;
		}		
	}
	print "$totalCDS CDS sequences found.\n";
}

#### Now look for all single orthologs
print "PROGRESS: Searching for one-to-one orthologs...\n";

my %oneToOneOrthologs;

foreach my $curReferenceGenome (keys %referenceGenomeSpecies) {
	opendir my $alignmentDirReader, $alignmentDirs{$curReferenceGenome}; 
	my @CREorthologFiles = grep (/\.CREorthologs.txt/, readdir($alignmentDirReader) );

	print "PROGRESS: Reference genome $curReferenceGenome. " . scalar @CREorthologFiles . " candidates.";

	foreach my $curCREOrth (@CREorthologFiles) {
		my %speciesInOrth = {};
		my %orthForSpecies ={};
		my %orthScoreForSpecies={};

		open( my $curCREOrthFile, $alignmentDirs{$curReferenceGenome} . "/$curCREOrth") || die ("INTERNAL ERROR.\n");

		my $referenceGene = $curCREOrth;
		$referenceGene =~ s/.*\///;   # remove trailing directory structure
		$referenceGene =~ s/\..*$//;  # remove the suffix
		my $fullReferenceGeneName = $referenceGenomeSpecies{$curReferenceGenome} . "-" . $curReferenceGenome . "-" . $referenceGene;

		$oneToOneOrthologs{$curReferenceGenome}{$curCREOrth}{$referenceGenomeSpecies{$curReferenceGenome} } = $fullReferenceGeneName;

		while(my $curLine = <$curCREOrthFile>) {
			my ($genome, $geneName, $upscore, $downscore) = split /,/, $curLine;
			my $orthScore = $upscore + $downscore;
			my $species = $geneName;
			$species =~ s/-.*$//;
			if($species ne $referenceGenomeSpecies{$curReferenceGenome} && defined $cdsSeqDB{$geneName} && defined $proteinSeqDB{$geneName}) {
				if($orthScore > $orthScoreForSpecies{$species}) {
					$orthScoreForSpecies{$species} = $orthScore;
					$oneToOneOrthologs{$curReferenceGenome}{$curCREOrth}{$species} = $geneName;
				}
			}
		}
		close( $curCREOrthFile);

		## Only keep the genes that have orthologs in all species of the family
		if( ( (scalar (keys %{ $oneToOneOrthologs{$curReferenceGenome}{$curCREOrth} })) != $numOfSpeciesInFamily)) {
			delete $oneToOneOrthologs{$curReferenceGenome}{$curCREOrth};
		}
	}
	print "..." . (scalar (keys %{ $oneToOneOrthologs{$curReferenceGenome} } ) ) . " possible orthogroups.\n";
}

## Now combine the reference genome orthologs
my @orthogroupsToKeep;

if(scalar @referenceGenomes > 1) {
	print "PROGRESS: Combining orthogroup lists for " . join(",", @referenceGenomes) . "\n";
	foreach my $curOrthogroupToCheck (keys %{ $oneToOneOrthologs{ $referenceGenomes[0] }  }) {
		foreach my $curReferenceGenome ( @referenceGenomes) {
			if($curReferenceGenome ne $referenceGenomes[0]) {
				## Find the matching orthogroup
				my @matchingOrthogroupsPerGenome;
				foreach my $curParallelOrthogroupToCheck (keys %{ $oneToOneOrthologs{ $curReferenceGenome} } ) {
					if($oneToOneOrthologs{$curReferenceGenome}{$curParallelOrthogroupToCheck}{ $referenceGenomeSpecies{ $referenceGenomes[0] } } eq
					   $oneToOneOrthologs{$referenceGenomes[0]}{$curOrthogroupToCheck}{ $referenceGenomeSpecies{ $referenceGenomes[0] } } ) {
						## Check to see if all orthologs match
						my (@orthogroupOne, @orthogroupTwo);
						foreach my $curSp ( keys %{ $oneToOneOrthologs{$referenceGenomes[0]}{$curOrthogroupToCheck} } ) {
							push @orthogroupOne, $oneToOneOrthologs{$referenceGenomes[0]}{$curOrthogroupToCheck}{$curSp};
						}
						foreach my $curSp ( keys %{ $oneToOneOrthologs{$curReferenceGenome}{$curParallelOrthogroupToCheck} } ) {
							push @orthogroupTwo, $oneToOneOrthologs{$referenceGenomes[0]}{$curOrthogroupToCheck}{$curSp};
						}
						@orthogroupOne = sort @orthogroupOne;
						@orthogroupTwo = sort @orthogroupTwo;
						if( @orthogroupOne == @orthogroupTwo) {
							my $match = 1;
							foreach  my $curSpNum (0..(@orthogroupOne)) {
								if($orthogroupOne[$curSpNum] ne $orthogroupTwo[$curSpNum]) {
									$match = 0;
								}
							}
							if($match) {
								push @orthogroupsToKeep, $curOrthogroupToCheck;
							} else {
							}
						} else {
#							print "..orthogroup rejected due to mismatch in ortholog numbber.\n";
						}
					}
				} 
			}
		}
	}
} else {
	@orthogroupsToKeep = keys %{ $oneToOneOrthologs{$referenceGenomes[0]}};
}

my @orthologProteins;
my %orthologCDSs;
my $orthologCount=1;
my $informativeBaseCount;

foreach my $curCREOrtholog (@orthogroupsToKeep) {
	my $skipGene=0;
	
	@orthologProteins =  ();
	%orthologCDSs = {};

	### Build OrthologSeq DB
	for my $curSpecies (keys %{ $oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog} }) {
		my $proteinSeq = $proteinSeqDB{ $oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog}{$curSpecies} };
		my $cdsSeq = $cdsSeqDB{$oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog}{$curSpecies}};

		push (@orthologProteins, Bio::LocatableSeq->new(-seq => $proteinSeq,
        	                	                -id => $curSpecies,
                                                -start => 1,
                                                -end => length($proteinSeq)));
        
        $orthologCDSs{ $curSpecies } = Bio::LocatableSeq->new(-seq => $cdsSeq,
      	                	            -id => $curSpecies,
        	                            -start => 1,
                	                    -end => length($cdsSeq) );

		if(length($cdsSeq) != length($proteinSeq)*3) {
			print "WARNING: Bad CDS data for " . $oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog}{$referenceGenomeSpecies{$referenceGenomes[0]} } . ":" . $oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog}{$curSpecies} . ". (" . length($proteinSeq)*3 . ":" . length($cdsSeq). "). Skipping gene. \n";
			$skipGene = 1;
		}
	}

	if(!$skipGene) {
       	print "PROGRESS (" . $orthologCount++ . "): $curCREOrtholog: " . $oneToOneOrthologs{$referenceGenomes[0]}{$curCREOrtholog}{$referenceGenomeSpecies{$referenceGenomes[0]} } . ". ";
		my $aligner = Bio::Tools::Run::Alignment::Muscle->new(('quiet' => 1));
       	my $proteinAlignment = $aligner->align(\@orthologProteins);
		print "Alignment: Protein (" . $proteinAlignment->length . "," . $proteinAlignment->num_sequences . ")";

   		my $cdsAlignment = Bio::Align::Utilities::aa_to_dna_aln($proteinAlignment, \%orthologCDSs);	
		print ";DNA (" .  $cdsAlignment->length . "," . $cdsAlignment->num_sequences . "). ";

		if($cdsAlignment->length() >0) {
	    	my %fourPositionNucleotide;
      		my @conservation = $cdsAlignment->consensus_conservation();
      		my $consensus_sequence = $cdsAlignment->consensus_string(); 
			my @gapMatrix = @{ $cdsAlignment->gap_col_matrix() };
      		my $curPos=0;
       		my @columnsToRemove;
			my $informativeBaseCountInOrthogroup;

      		while($curPos < $cdsAlignment->length() ) {
				my $gapAtPos = sum( values %{ $gapMatrix[$curPos] } );

      			if($conservation[$curPos] == 100 && $conservation[$curPos+1] ==100  && substr($consensus_sequence,$curPos, 2) ~~ @fourLetterCodons) {
   					### If perfect conservation for position 1 and 2 of the codon;
					if($conservation[$curPos+2] < 100) { $informativeBaseCount++; $informativeBaseCountInOrthogroup++; }
   					push @columnsToRemove, [($curPos), ($curPos +1) ];
				} elsif (substr($consensus_sequence,$curPos, 2) ~~ @fourLetterCodons && $gapAtPos<= $MAX_GAPS) { 
   					push @columnsToRemove, [($curPos), ($curPos +1) ];
				} else {
       				push @columnsToRemove, [($curPos),($curPos + 2) ]
   				}
   				$curPos+=3;
   			}
   			my $just4posVar = $cdsAlignment->remove_columns(@columnsToRemove);	
   			print "Filter (" . $just4posVar->length() . "," . $just4posVar->num_sequences . "). ";
   			if($just4posVar->length() >0 ) {
				print "4-Fold variants (" . $just4posVar->length() . ",$informativeBaseCountInOrthogroup).\n";
    			foreach my $seq ($just4posVar->each_seq) {
   					$FourVarPosSequence{ $seq->id() } .= $seq->seq();
   				}
   			} else { print "\n"; }
 		}
	}
	if($informativeBaseCount > $minInformativeBases) {
		last;
	}
}


####### Now dump fasta
open my $tmpFasta, ">$tmpFastaFileName";
foreach my $fastaSeq ( keys %FourVarPosSequence ) {
   	print $tmpFasta ">$fastaSeq\n" . $FourVarPosSequence{$fastaSeq} . "\n";
}
close $tmpFasta;
print "PROGRESS: Alignment FASTA file assembled.\n";

####### make Model
print "PROGRESS: Prepare starting model file";
my $startModelTmpFileName = "$referenceGenomeFamily.mod.tmp";
system("cp $defaultModelFile $startModelTmpFileName");
open (my $treeFile, "<", $treeFileName);
my $treeLine = <$treeFile>;
chomp($treeLine);
close($treeFile);
open(my $tmpModelFile , ">>" , $startModelTmpFileName);
print $tmpModelFile "TREE: $treeLine\n";
close($tmpModelFile);

print "PROGRESS: START computing model.\n";
system("phyloFit --init-model $startModelTmpFileName --EM --precision MED --nrates 4 --scale-only --subst-mod $model --out-root $referenceGenomeFamily $tmpFastaFileName");
unlink($startModelTmpFileName);
#system("phyloFit --tree $treeFileName --nrates 4 --subst-mod HKY85+GAP --out-root $referenceGenome $tmpFastaFileName");
print "PROGRESS: END computing model.\n";
if(!$keepTmp) { unlink($tmpFastaFileName); }

system("mv -f $referenceGenomeFamily.mod $conservatoryDir/genomes/$referenceGenomeFamily.mod");
print "DONE.\n";
